# 抓取官网最新文档内容

from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from time import ctime, sleep
import logging
import math
import os
import threading
import time

import pymysql
from bs4 import BeautifulSoup

# Database connection settings.
# NOTE(review): credentials are hard-coded in source — consider loading them
# from environment variables or a config file kept out of version control.
dbHost = '127.0.0.1'
dbUser = 'dengqihua'
dbPwd = 'dengqihua520'
dbName = 'phpdoc'
dbCharset = 'utf8'

# Base URL of the official PHP manual (Chinese edition).
offcialDocUrl = 'http://php.net/manual/zh/'

# Create the shared module-level database connection.
# Keyword arguments are required: pymysql 1.0+ made connect() parameters
# keyword-only, so the old positional call breaks on modern pymysql.
dbConn = pymysql.connect(host=dbHost, user=dbUser, password=dbPwd,
                         database=dbName, charset=dbCharset)

# Cursor shared by the module-level helper functions below.
curserObj = dbConn.cursor()

exitFlag = 0

# Logging setup: one file per day under /var/log/.
logger = logging.getLogger()
# Record format for every log line.
formatter = logging.Formatter('%(asctime)s %(levelname)-8s: %(message)s')
# File log destination (dated filename).
logsDir = '/var/log/'
logfilename = logsDir + 'phpdoc_python_gradeNewsDoc_' + str(time.strftime("%Y-%m-%d", time.localtime())) + '.log'
file_handler = logging.FileHandler(filename=logfilename, encoding="utf-8")
# Attach the format to the handler.
file_handler.setFormatter(formatter)
# Register the handler on the root logger.
logger.addHandler(file_handler)
# Minimum level to emit; the default would be WARNING.
logger.setLevel(logging.INFO)

class gradeThread(threading.Thread):
    """Worker thread that refreshes one slice of the document table."""

    def __init__(self, threadID, name, starts, limit):
        super().__init__()
        self.threadID = threadID
        self.name = name
        # Offset and row count of the slice this worker handles.
        self.starts = starts
        self.limit = limit

    def run(self):
        # Log entry and exit around the actual work so progress is traceable.
        logger.info('开始线程：' + self.name)
        doUpdate(self.name, self.starts, self.limit)
        logger.info('退出线程：' + self.name)


# 获取数据库记录的文档最后更新时间
def getLastUpdateTime():
    """Return the stored last-update timestamp from p_configs as a string."""
    curserObj.execute('SELECT last_update_time FROM p_configs')
    row = curserObj.fetchone()
    lastUpdateTime = str(row[0])
    logger.info('数据库最后更新时间：' + lastUpdateTime)
    return lastUpdateTime


# 抓取官网记录的最后更新时间
def gradePubdate():
    """Fetch the manual index page and return its publication-date string."""
    page = urlopen(offcialDocUrl + 'index.php')
    soup = BeautifulSoup(page.read(), "html.parser")
    # The publish date lives in a <div class="pubdate"> on the index page.
    pubdate = str(soup.find('div', {'class': 'pubdate'}).get_text())
    logger.info('官网最后更新时间：' + pubdate)
    return pubdate


# 抓取官网指定文件名的主体内容
def gradeDocumentContent(filename, threadName):
    """Fetch <filename>.php from the official manual and return its body node.

    Returns None (after logging) when the page cannot be fetched or when the
    expected markup is missing, so callers can simply skip the document.
    """
    try:
        html = urlopen(offcialDocUrl + filename + '.php')
    except URLError:
        # URLError also covers HTTPError (its subclass). The original caught
        # only HTTPError, so DNS/connection failures crashed the worker thread.
        logger.info('线程 - ' + threadName + '抓取：' + filename + '时，网络出错')
        return None

    try:
        bsObj = BeautifulSoup(html.read(), 'html.parser')
        # The document body is the sibling node right after the page-tools div.
        return bsObj.find('div', {'class': 'page-tools'}).next_sibling
    except AttributeError:
        # find() returned None — the page layout did not match expectations.
        logger.info('线程 - ' + threadName + '抓取：' + filename + '的属性时出错')
        return None


# 获取数据库总记录数
def getDocumentTotal():
    """Return the total number of rows in p_documents_copy."""
    curserObj.execute('SELECT COUNT(*) FROM p_documents_copy')
    total = curserObj.fetchone()[0]
    logger.info('数据库总记录数：' + str(total))
    return total


# 更新数据库配置的最后更新时间
def updateLastUpdateTime(pubdate):
    """Persist the official site's publish date into p_configs.

    Opens its own connection because this may run after the module-level
    connection has been closed.
    """
    # Keyword args: pymysql 1.0+ made connect() parameters keyword-only.
    dbConn = pymysql.connect(host=dbHost, user=dbUser, password=dbPwd,
                             database=dbName, charset=dbCharset)
    curserObj = dbConn.cursor()
    try:
        sql = 'UPDATE p_configs SET last_update_time= %s WHERE status = 1'
        # (pubdate,) is a real 1-tuple; the original (pubdate) was just a
        # parenthesized string and relied on pymysql's scalar-args fallback.
        curserObj.execute(sql, (pubdate,))
        # Commit the change.
        dbConn.commit()
        logger.info('数据库配置的最后更新时间更新为：' + pubdate)
    except Exception:
        # Roll back and log; the original bare `except:` silently swallowed
        # every error, including KeyboardInterrupt.
        dbConn.rollback()
        logger.exception('更新 p_configs 最后更新时间失败')
    finally:
        # Always release the cursor/connection, even on unexpected errors.
        curserObj.close()
        dbConn.close()


# 多线程执行更新文档
def doUpdate(threadName, starts, limit):
    """Fetch and store fresh content for one slice of p_documents_copy.

    Each worker opens its own connection because pymysql connections are
    not safe to share across threads.
    """
    dbConn = pymysql.connect(host=dbHost, user=dbUser, password=dbPwd,
                             database=dbName, charset=dbCharset)
    curserObj = dbConn.cursor()

    try:
        # Parameterized LIMIT instead of %-string interpolation into SQL.
        sql = 'SELECT file_name FROM p_documents_copy WHERE status = 1 LIMIT %s,%s'
        curserObj.execute(sql, (int(starts), int(limit)))
        rows = curserObj.fetchall()  # renamed: `list` shadowed the builtin

        for row in rows:
            filename = row[0]
            content = gradeDocumentContent(filename, threadName)
            if content is None:
                # Fetch or parse failed; skip this document.
                continue
            content = str(content)

            nowTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            logger.info('线程 - ' + threadName + '正在更新：' + filename)

            try:
                sql = 'UPDATE p_documents_copy SET content= %s, update_time = %s WHERE file_name = %s'
                curserObj.execute(sql, (content, nowTime, filename))
                dbConn.commit()
                logger.info('线程 - ' + threadName + '更新：' + filename + ' 成功！')
            except Exception:
                # Roll back this row's update; narrow from the original bare
                # except so KeyboardInterrupt etc. still propagate.
                dbConn.rollback()
                logger.info('线程 - ' + threadName + '更新：' + filename + ' 失败！')

            # Throttle requests against the official site.
            sleep(0.1)
    finally:
        # Always release resources, even if the loop raises.
        curserObj.close()
        dbConn.close()


pubdate = gradePubdate()
lastUpdateTime = getLastUpdateTime()

# Decide whether the local copy is stale.
# NOTE(review): this is a plain string comparison — it assumes both
# timestamps share a lexicographically sortable date format; confirm.
if pubdate <= lastUpdateTime:
    # Local copy is current — nothing to do.
    logger.info('不需要更新')

    # Release the module-level cursor and connection.
    curserObj.close()
    dbConn.close()
else:
    logger.info('需要执行更新!')

    # Total number of documents to refresh.
    total = getDocumentTotal()

    # Workers open their own connections, so the shared ones can close now.
    curserObj.close()
    dbConn.close()

    # Number of worker threads to run concurrently.
    processNum = 1

    # Rows handled by each worker.
    limit = math.ceil(total / processNum)

    logger.info('同时执行' + str(processNum) + '个线程，每个线程执行' + str(limit) + '条记录')

    threads = []

    # Spawn one worker per slice.
    for idx in range(processNum):
        worker = gradeThread(idx, 'GradeThread-' + str(idx), idx * limit, limit)
        worker.start()
        threads.append(worker)

    # Wait for every worker to finish.
    for worker in threads:
        worker.join()

    logger.info('全部线程执行完！')

    # Record the new last-update time in the database (currently disabled).
    # updateLastUpdateTime(pubdate)
